{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0",
   "metadata": {
    "id": "da11e5a8"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import warnings\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.base import BaseEstimator\n",
    "warnings.simplefilter('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1",
   "metadata": {
    "id": "f815a26a"
   },
   "source": [
    "We import a clone of hdmpy to use rlasso functions in python\n",
    "```\n",
    "!pip install multiprocess\n",
    "!pip install pyreadr\n",
    "!git clone https://github.com/maxhuppertz/hdmpy.git\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "q9tTqKaPflDN",
    "outputId": "668fb280-4307-4bfd-b9f6-7d779a98a7d9"
   },
   "outputs": [],
   "source": [
    "!pip install multiprocess\n",
    "!pip install pyreadr\n",
    "!git clone https://github.com/maxhuppertz/hdmpy.git"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {
    "id": "d465905f"
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.insert(1, \"./hdmpy\")\n",
    "import hdmpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4",
   "metadata": {
    "id": "d5d475d7"
   },
   "outputs": [],
   "source": [
    "# An estimator class that runs the lasso with theoretically driven penalty choice.\n",
    "# Better in small samples than cross-validation and also faster computationally\n",
    "\n",
    "class RLasso(BaseEstimator):\n",
    "\n",
    "    def __init__(self, *, post=False):\n",
    "        self.post = post\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        self.rlasso_ = hdmpy.rlasso(X, y, post=self.post)\n",
    "        return self\n",
    "\n",
    "    @property\n",
    "    def coef_(self):\n",
    "        return np.array(self.rlasso_.est['beta']).flatten()\n",
    "\n",
    "    def predict(self, X):\n",
    "        return X @ self.coef_ + np.array(self.rlasso_.est['intercept'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {
    "id": "b8c3a418"
   },
   "outputs": [],
   "source": [
    "# A simple experimental data generating process. No effect heterogeneity.\n",
    "def gen_data(n, d, p, delta, base):\n",
    "    X = np.random.normal(0, 1, size=(n, d))\n",
    "    D = np.random.binomial(1, p, size=(n,))\n",
    "    y0 = base - X[:, 0] + np.random.normal(0, .1, size=(n,))\n",
    "    y1 = delta + base - X[:, 0] + np.random.normal(0, .1, size=(n,))\n",
    "    y = y1 * D + y0 * (1 - D)\n",
    "    return y, D, X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {
    "id": "258d88f2"
   },
   "outputs": [],
   "source": [
    "n = 100  # n samples\n",
    "d = 100  # n features\n",
    "delta = 1.0  # treatment effect\n",
    "base = .3  # baseline outcome"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {
    "id": "69d66c15"
   },
   "outputs": [],
   "source": [
    "# Simple two means estimate and calculation of variance\n",
    "def twomeans(y, D):\n",
    "    hat0 = np.mean(y[D == 0])  # mean of outcome of un-treated\n",
    "    hat1 = np.mean(y[D == 1])  # mean of outcome of treated\n",
    "    V0 = np.var(y[D == 0]) / np.mean(1 - D)  # asymptotic variance of the mean of outcome of untreated\n",
    "    V1 = np.var(y[D == 1]) / np.mean(D)  # asymptotic variance of the mean of outcome of treated\n",
    "    hat = hat1 - hat0  # estimate of effect\n",
    "    stderr = np.sqrt((V0 + V1) / n)  # standard error of estimate of effect\n",
    "    return hat, stderr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "db73000b",
    "outputId": "2cf6c9cd-3d37-4770-94f1-f31487acc639"
   },
   "outputs": [],
   "source": [
    "np.random.seed(123)\n",
    "y, D, X = gen_data(n, d, .2, delta, base)  # generate RCT data\n",
    "twomeans(y, D)  # calculate estimation quantities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9",
   "metadata": {
    "id": "0f4535a9"
   },
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "\n",
    "# We implement the partialling out version of OLS (for pedagogical purposes)\n",
    "def partialling_out(y, D, W):\n",
    "    yres = y - LinearRegression().fit(W, y).predict(W)  # residualize outcome with OLS\n",
    "    Dres = D - LinearRegression().fit(W, D).predict(W)  # residualize treatment with OLS\n",
    "    hat = np.mean(yres * Dres) / np.mean(Dres**2)  # calculate final residual ~ residual ols estimate\n",
    "    epsilon = yres - hat * Dres  # calculate residual of final regression; epsilon in the BLP decomposition\n",
    "    V = np.mean(epsilon**2 * Dres**2) / np.mean(Dres**2)**2  # calculate variance of effect\n",
    "    return hat, np.sqrt(V / y.shape[0])  # return estimate and standard error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "6f718a18",
    "outputId": "cc88a18a-868f-40b6-9b3a-ac540e699128"
   },
   "outputs": [],
   "source": [
    "partialling_out(y, D, np.hstack([D * X, X]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {
    "id": "84e660b2"
   },
   "outputs": [],
   "source": [
    "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n",
    "def double_lasso(y, D, W):\n",
    "    yres = y - RLasso().fit(W, y).predict(W)  # residualize outcome with Lasso\n",
    "    Dres = D - RLasso().fit(W, D).predict(W)  # residualize treatment with Lasso\n",
    "    # rest is the same as in the OLS case\n",
    "    hat = np.mean(yres * Dres) / np.mean(Dres**2)\n",
    "    epsilon = yres - hat * Dres\n",
    "    V = np.mean(epsilon**2 * Dres**2) / np.mean(Dres**2)**2\n",
    "    return hat, np.sqrt(V / y.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "c8b1cfd5",
    "outputId": "a0936aca-3897-45b3-a565-bc3f5dea6e5d"
   },
   "outputs": [],
   "source": [
    "double_lasso(y, D, np.hstack([D * X, X]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13",
   "metadata": {
    "id": "3e58c0e9"
   },
   "outputs": [],
   "source": [
    "# We now check the distributional properties of the different estimators across experiments\n",
    "# First is the simple two means estimate\n",
    "\n",
    "# we will keep track of coverage (truth is in CI) and of the point estimate and stderr\n",
    "cov, hats, stderrs = [], [], []\n",
    "for _ in range(100):\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)\n",
    "    hat, stderr = twomeans(y, D)\n",
    "    ci = [hat - 1.96 * stderr, hat + 1.96 * stderr]  # 95% confidence interval\n",
    "    hats += [hat]\n",
    "    stderrs += [stderr]\n",
    "    cov += [(ci[0] <= delta) & (delta <= ci[1])]  # 1 if CI contains the true parameter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "67f2a63d",
    "outputId": "9e9ab5c8-2558-4137-9d1e-644808640768"
   },
   "outputs": [],
   "source": [
    "np.mean(cov)  # average coverage (should be .95 ideally)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 432
    },
    "id": "31e49a00",
    "outputId": "ee0d23c0-ce1a-4377-8f45-131ee301506e"
   },
   "outputs": [],
   "source": [
    "# distribution of estimates\n",
    "plt.hist(hats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "81766f99",
    "outputId": "1dc11627-9f61-4a83-e61a-8951a534b8b0"
   },
   "outputs": [],
   "source": [
    "# mean of estimate; measures how biased the estimate is (should be =delta ideally)\n",
    "np.mean(hats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "174f80f2",
    "outputId": "ec902785-1598-4f2d-fae1-c81bbcf33b66"
   },
   "outputs": [],
   "source": [
    "# standard deviation of estimates; should be close to the standard errors we calculated for the CIs\n",
    "np.std(hats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "2789f752",
    "outputId": "ec2b7103-b7f3-4291-ef40-7b51132ca1e9"
   },
   "outputs": [],
   "source": [
    "np.mean(stderrs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19",
   "metadata": {
    "id": "a3a5c66c"
   },
   "outputs": [],
   "source": [
    "# Let's repeat this for the partialling out process (OLS), controlling for X\n",
    "# we will keep track of coverage (truth is in CI) and of the point estimate and stderr\n",
    "cov, hats, stderrs = [], [], []\n",
    "for _ in range(100):\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)\n",
    "    hat, stderr = partialling_out(y, D, X)\n",
    "    ci = [hat - 1.96 * stderr, hat + 1.96 * stderr]  # 95% confidence interval\n",
    "    hats += [hat]\n",
    "    stderrs += [stderr]\n",
    "    cov += [(ci[0] <= delta) & (delta <= ci[1])]  # 1 if CI contains the true parameter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "535ff01b",
    "outputId": "6bb6fe88-8b28-4f34-d881-1fd863eb697a"
   },
   "outputs": [],
   "source": [
    "np.mean(cov)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 432
    },
    "id": "e1a42254",
    "outputId": "4f1de30d-80a4-425f-8ae2-43993c638541"
   },
   "outputs": [],
   "source": [
    "plt.hist(hats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "7c1cfa87",
    "outputId": "111d0e12-717d-47f4-8c90-6a2fe8e70928"
   },
   "outputs": [],
   "source": [
    "np.mean(hats)  # ols is heavily biased... mean of estimates very far from delta=1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "442740a1",
    "outputId": "61aaab80-4e56-4ca4-c8f5-6422c690faa3"
   },
   "outputs": [],
   "source": [
    "np.std(hats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "cc4d40e2",
    "outputId": "fd12b66e-dd2d-4843-bde6-b9fd76d395ae"
   },
   "outputs": [],
   "source": [
    "# standard error severely under estimates the variance of the estimate; all this is due to overfitting\n",
    "np.mean(stderrs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "9e0a09f9",
    "outputId": "71b8046e-a42a-4930-fa01-fbd685cf1d78"
   },
   "outputs": [],
   "source": [
    "# Now let's try the double Lasso. Because it's computationally expensive\n",
    "# we'll do the experiments in parallel. Python makes parallelism very simple\n",
    "from joblib import Parallel, delayed  # we import these two functions\n",
    "\n",
    "\n",
    "# we wrap our experiment process in a function, which is supposed to run a\n",
    "# a single experiment\n",
    "def exp(it, n, d):\n",
    "    np.random.seed(it)  # we draw a different seed for each experiment\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)  # we generate data\n",
    "    hat, stderr = double_lasso(y, D, X)  # we apply the double lasso process\n",
    "    ci = [hat - 1.96 * stderr, hat + 1.96 * stderr]\n",
    "    # return estimate, standard error and (1 if CI contains the true parameter)\n",
    "    return hat, stderr, (ci[0] <= delta) & (delta <= ci[1])\n",
    "\n",
    "\n",
    "# Now here is how you run any function in parallel multiple times\n",
    "# It's a simple parallel for loop.\n",
    "res = Parallel(n_jobs=-1, verbose=3)(delayed(exp)(it, n, d) for it in range(100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26",
   "metadata": {
    "id": "45cdfbba"
   },
   "outputs": [],
   "source": [
    "# This simply takes the list of triples and turns it into a triple of lists :)\n",
    "# good trick to know\n",
    "hats, stderrs, cov = zip(*res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "48544788",
    "outputId": "d564ef72-8399-446c-c284-35332aed2f9e"
   },
   "outputs": [],
   "source": [
    "np.mean(cov)  # much better coverage than OLS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 432
    },
    "id": "58334e8e",
    "outputId": "6ffab94e-6420-4dcf-b3d1-b80fbd138987"
   },
   "outputs": [],
   "source": [
    "plt.hist(hats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "1fc08ca4",
    "outputId": "cc9c144b-10ef-4890-fabe-b2d96196f4fb"
   },
   "outputs": [],
   "source": [
    "np.mean(hats)  # much closer to 1... (almost the same as two-means)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "2194a541",
    "outputId": "490e83b4-0116-49b7-fc63-966d2ac731f5"
   },
   "outputs": [],
   "source": [
    "np.std(hats)  # standard deviation much smaller than two means, which did not adjust for X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "be61450d",
    "outputId": "2e38c07c-0e44-4de8-ebce-d4b5e1e87522"
   },
   "outputs": [],
   "source": [
    "np.mean(stderrs)  # and close to the calculate standard errors; we correctly estimated uncertainty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "394859c2",
    "outputId": "6f2d95f4-eb2e-4e1f-8bb7-6c73b86274ec"
   },
   "outputs": [],
   "source": [
    "# Let's see what would happen if we just run a single lasso\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "\n",
    "def exp(it, n, d):\n",
    "    np.random.seed(it)\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)\n",
    "    hat = RLasso().fit(np.hstack([D.reshape(-1, 1), X]), y).coef_[0]\n",
    "    return hat  # no obvious way to account for uncertainty\n",
    "\n",
    "\n",
    "res = Parallel(n_jobs=-1, verbose=3)(delayed(exp)(it, n, d) for it in range(100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 430
    },
    "id": "893d95ba",
    "outputId": "f1a4b30d-612b-466b-ddce-0db27f815135"
   },
   "outputs": [],
   "source": [
    "plt.hist(res)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "f1711931",
    "outputId": "d14a8896-15c6-4a8d-d40f-f86a505e9bc2"
   },
   "outputs": [],
   "source": [
    "# bias is comparable and larger than standard deviation.\n",
    "# Even if we could estimate the standard deviation, confidence intervals would undercover\n",
    "1 - np.mean(res), np.std(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "3fb42054",
    "outputId": "e0ef5f96-eeae-42e5-d3c6-58fad785a26a"
   },
   "outputs": [],
   "source": [
    "# Let's try adding a post-Lasso OLS step and construct confidence\n",
    "# intervals ignoring the Lasso step\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "\n",
    "def exp(it, n, d):\n",
    "    np.random.seed(it)\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)\n",
    "    # run a big lasso y ~ D, X\n",
    "    DX = np.hstack([D.reshape(-1, 1), X])\n",
    "    coefs = RLasso().fit(DX, y).coef_[1:]\n",
    "    # run OLS on y ~ D, X[chosen by lasso]\n",
    "    # calculate standard error as if lasso step never happened\n",
    "    hat, stderr = partialling_out(y, D - np.mean(D), X[:, np.abs(coefs) > 0.0])\n",
    "    ci = [hat - 1.96 * stderr, hat + 1.96 * stderr]\n",
    "    return hat, stderr, (ci[0] <= delta) & (delta <= ci[1])\n",
    "\n",
    "\n",
    "res = Parallel(n_jobs=-1, verbose=3)(delayed(exp)(it, n, d) for it in range(100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36",
   "metadata": {
    "id": "25303e2a"
   },
   "outputs": [],
   "source": [
    "hats, stderrs, cov = zip(*res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "68b51670",
    "outputId": "94127758-b0aa-4f09-bcc3-1b42d8bf1b2e"
   },
   "outputs": [],
   "source": [
    "np.mean(cov)  # not bad"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 430
    },
    "id": "f538442b",
    "outputId": "31a8bb58-6ea9-4636-8e92-3db912d8c346"
   },
   "outputs": [],
   "source": [
    "plt.hist(hats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "3c71cd7d",
    "outputId": "de5e9a8e-c901-4dbb-c0a9-87307f562a05"
   },
   "outputs": [],
   "source": [
    "1 - np.mean(hats), np.std(hats)  # quite un-biased; bias < standard deviation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b74cf552",
    "outputId": "d3e359fd-4c4b-4b4f-bee2-0e5c80ad1f9b"
   },
   "outputs": [],
   "source": [
    "# we under-estimated a bit the uncertainty; smaller estimated stderr than true std.\n",
    "# this is most prob a finite sample error, from ignoring the lasso variable selection step\n",
    "# this is an RCT and so even post lasso ols is Neyman orthogonal. We should expect good behavior.\n",
    "np.mean(stderrs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41",
   "metadata": {
    "id": "b0eeedd3"
   },
   "outputs": [],
   "source": [
    "# But what if we are not in an RCT!?\n",
    "def gen_data(n, d, p, delta, base):\n",
    "    X = np.random.normal(0, 1, size=(n, d))\n",
    "    D = X[:, 0] + np.random.normal(0, 1 / 4, size=(n,))\n",
    "    y = delta * D + base - X[:, 0] + np.random.normal(0, 1, size=(n,))\n",
    "    return y, D, X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "dd843f2e",
    "outputId": "35794f1b-f3d2-47f4-d810-9e805f839ad1"
   },
   "outputs": [],
   "source": [
    "from joblib import Parallel, delayed\n",
    "\n",
    "\n",
    "def exp(it, n, d):\n",
    "    np.random.seed(it)\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)\n",
    "    DX = np.hstack([D.reshape(-1, 1), X])\n",
    "    coefs = RLasso().fit(DX, y).coef_[1:]\n",
    "    hat, stderr = partialling_out(y, D, X[:, np.abs(coefs) > 0.0])\n",
    "    ci = [hat - 1.96 * stderr, hat + 1.96 * stderr]\n",
    "    return hat, stderr, (ci[0] <= delta) & (delta <= ci[1])\n",
    "\n",
    "\n",
    "res = Parallel(n_jobs=-1, verbose=3)(delayed(exp)(it, n, d) for it in range(100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43",
   "metadata": {
    "id": "26f93c1e"
   },
   "outputs": [],
   "source": [
    "hats, stderrs, cov = zip(*res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b30e1d54",
    "outputId": "fdf24de6-a6ca-46f1-a5a1-54d834f32b24"
   },
   "outputs": [],
   "source": [
    "np.mean(cov)  # Oops! Post Lasso OLS severely undercovers; It is not Neyman orthogonal when D is correlated with X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 430
    },
    "id": "ea9e640f",
    "outputId": "620f1c0b-013d-4912-e3c8-5396779d23aa"
   },
   "outputs": [],
   "source": [
    "plt.hist(hats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "a725e335",
    "outputId": "0624e1e1-cacf-4453-e1c1-4085317bd002"
   },
   "outputs": [],
   "source": [
    "np.mean(hats)  # very heavily biased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "5d52c197",
    "outputId": "c5b6e722-dc34-4ab2-be27-4c393583c2b1"
   },
   "outputs": [],
   "source": [
    "# But let's try the Neyman orthogonal Double Lasso\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "\n",
    "def exp(it, n, d):\n",
    "    np.random.seed(it)\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)\n",
    "    hat, stderr = double_lasso(y, D, X)  # we apply the double lasso process\n",
    "    ci = [hat - 1.96 * stderr, hat + 1.96 * stderr]\n",
    "    return hat, stderr, (ci[0] <= delta) & (delta <= ci[1])\n",
    "\n",
    "\n",
    "res = Parallel(n_jobs=-1, verbose=3)(delayed(exp)(it, n, d) for it in range(100))\n",
    "hats, stderrs, cov = zip(*res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "3c289eb3",
    "outputId": "f2160f87-36d8-47f7-e57f-ebcdaa305a19"
   },
   "outputs": [],
   "source": [
    "np.mean(cov)  # great coverage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "add24767",
    "outputId": "3a127657-014d-484b-86d6-c61af841b074"
   },
   "outputs": [],
   "source": [
    "1 - np.mean(hats), np.std(hats)  # very small bias compared to standard deviation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "8a89f4c7",
    "outputId": "26ac3037-ca95-4428-9897-ee62e62c01c7"
   },
   "outputs": [],
   "source": [
    "np.mean(stderrs)  # accurate estimation of uncertainty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 430
    },
    "id": "d33463a5",
    "outputId": "ea232351-c122-43df-ce70-a10e503818d9"
   },
   "outputs": [],
   "source": [
    "# Approximately normal distribution of estimates, centered at the truth\n",
    "plt.hist(hats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52",
   "metadata": {
    "id": "e3ae78ae"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
